{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One-time setup: uncomment and run to fetch the CSVs and repositories that the loading cells read from.\n",
    "# !wget https://raw.githubusercontent.com/ahmadizzan/netifier/master/data/raw/test.csv\n",
    "# !wget https://raw.githubusercontent.com/ahmadizzan/netifier/master/data/raw/train.csv\n",
    "\n",
    "# !git clone https://github.com/IndoNLP/indonlu\n",
    "# !git clone https://github.com/IndoNLP/nusax\n",
    "# !git clone https://github.com/ahmadizzan/netifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "from tqdm import tqdm\n",
    "import malaya\n",
    "import pandas as pd\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test.csv  train.csv\r\n"
     ]
    }
   ],
   "source": [
    "# List the CSV files present in the working directory.\n",
    "!ls *.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7773"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Start the corpus with the `original_text` column of every CSV in the working directory.\n",
    "texts = [\n",
    "    text\n",
    "    for csv_path in glob('*.csv')\n",
    "    for text in pd.read_csv(csv_path)['original_text'].tolist()\n",
    "]\n",
    "\n",
    "len(texts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['sentence', 'fuel', 'machine', 'others', 'part', 'price', 'service'], dtype='object')\n",
      "Index(['sentence', 'fuel', 'machine', 'others', 'part', 'price', 'service'], dtype='object')\n",
      "Index(['sentence', 'fuel', 'machine', 'others', 'part', 'price', 'service'], dtype='object')\n",
      "Index(['sentence', 'fuel', 'machine', 'others', 'part', 'price', 'service'], dtype='object')\n",
      "Index(['review', 'ac', 'air_panas', 'bau', 'general', 'kebersihan', 'linen',\n",
      "       'service', 'sunrise_meal', 'tv', 'wifi'],\n",
      "      dtype='object')\n",
      "Index(['review', 'ac', 'air_panas', 'bau', 'general', 'kebersihan', 'linen',\n",
      "       'service', 'sunrise_meal', 'tv', 'wifi'],\n",
      "      dtype='object')\n",
      "Index(['review', 'ac', 'air_panas', 'bau', 'general', 'kebersihan', 'linen',\n",
      "       'service', 'sunrise_meal', 'tv', 'wifi'],\n",
      "      dtype='object')\n",
      "Index(['review', 'ac', 'air_panas', 'bau', 'general', 'kebersihan', 'linen',\n",
      "       'service', 'sunrise_meal', 'tv', 'wifi'],\n",
      "      dtype='object')\n",
      "Index(['label', 'tweet'], dtype='object')\n",
      "Index(['label', 'tweet'], dtype='object')\n",
      "Index(['label', 'tweet'], dtype='object')\n",
      "Index(['label', 'tweet'], dtype='object')\n",
      "Index(['question', 'passage', 'seq_label'], dtype='object')\n",
      "'tweet'\n",
      "Index(['question', 'passage', 'seq_label'], dtype='object')\n",
      "'tweet'\n",
      "Index(['question', 'passage', 'seq_label'], dtype='object')\n",
      "'tweet'\n",
      "Index(['question', 'passage', 'seq_label'], dtype='object')\n",
      "'tweet'\n",
      "Index(['sent_A', 'sent_B', 'category', 'label'], dtype='object')\n",
      "Index(['sent_A', 'sent_B', 'category', 'label'], dtype='object')\n",
      "Index(['sent_A', 'sent_B', 'category', 'label'], dtype='object')\n",
      "Index(['sent_A', 'sent_B', 'category', 'label'], dtype='object')\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "18114"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Append the text column(s) of every IndoNLU CSV to `texts`.\n",
    "# Recognised text columns in priority order (first match wins if a file has several).\n",
    "TEXT_COLUMNS = ('sent_A', 'tweet', 'sentence', 'review')\n",
    "\n",
    "for f in glob('indonlu/dataset/*/*.csv'):\n",
    "    df = pd.read_csv(f)\n",
    "    # Resolve the column afresh for each file so a match from an earlier file can never leak in.\n",
    "    c = next((name for name in TEXT_COLUMNS if name in df.columns), None)\n",
    "    if c is None:\n",
    "        # No recognised text column (e.g. question/passage/seq_label files): report and skip.\n",
    "        print(f'skipping {f}: no text column among {list(df.columns)}')\n",
    "        continue\n",
    "    texts.extend(df[c].tolist())\n",
    "    # Files with a `sent_B` column contribute that column as well.\n",
    "    if 'sent_B' in df.columns:\n",
    "        texts.extend(df['sent_B'].tolist())\n",
    "\n",
    "len(texts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "31374"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# IndoNLU TSV files are parsed without a header row; column 0 is appended to the corpus.\n",
    "for tsv_path in glob('indonlu/dataset/*/*.tsv'):\n",
    "    frame = pd.read_csv(tsv_path, sep = '\\t', header = None)\n",
    "    first_column = frame[0].tolist()\n",
    "    texts.extend(first_column)\n",
    "\n",
    "len(texts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "42374"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NusaX sentiment CSVs, leaving out any path that contains 'english'.\n",
    "nusax = [\n",
    "    path\n",
    "    for path in glob('nusax/datasets/sentiment/*/*.csv')\n",
    "    if 'english' not in path\n",
    "]\n",
    "# Append the `text` column of each remaining file to the corpus.\n",
    "for path in nusax:\n",
    "    texts.extend(pd.read_csv(path)['text'].tolist())\n",
    "\n",
    "len(texts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3966114"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Append every non-empty line of the plain-text corpus files to `texts`.\n",
    "# NOTE(review): absolute, machine-specific path; consider a configurable data directory.\n",
    "files = glob('/home/husein/ssd3/id-texts/*.txt')\n",
    "for f in files:\n",
    "    # Decode explicitly instead of relying on the platform default.\n",
    "    # Assumes the corpus files are UTF-8 -- TODO confirm.\n",
    "    with open(f, encoding = 'utf-8') as fopen:\n",
    "        # split('\\n') yields '' after a trailing newline and for blank lines;\n",
    "        # drop those so no empty records end up in the corpus.\n",
    "        txt = [line for line in fopen.read().split('\\n') if line]\n",
    "    texts.extend(txt)\n",
    "\n",
    "len(texts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dump the collected corpus to JSONL: one JSON-encoded string per line.\n",
    "# Open, write and close happen in a single cell via a context manager, so the handle is\n",
    "# always closed (even if the loop fails) and the cell can be re-run safely.\n",
    "# NOTE(review): json.dumps would emit NaN for pandas missing values, which is not valid JSON;\n",
    "# confirm that the source columns contain no nulls.\n",
    "with open('prepare-indon.jsonl', 'w') as id_fopen:\n",
    "    for t in tqdm(texts):\n",
    "        id_fopen.write(f'{json.dumps(t)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Path to the JSONL input whose per-line 'text' field is copied into prepare-indon-standard.jsonl.\n",
    "# NOTE(review): absolute, machine-specific path; consider making it configurable.\n",
    "id_news = '/home/husein/ssd3/id-texts/ccnews-id.jsonl'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy the 'text' field of every record in `id_news` to prepare-indon-standard.jsonl,\n",
    "# one JSON-encoded string per line.\n",
    "# Both files are managed by one `with` statement, so they are closed even if a line fails\n",
    "# to parse, and the cell can be re-run without a stale or already-closed handle.\n",
    "with open(id_news) as fopen, open('prepare-indon-standard.jsonl', 'w') as id_fopen:\n",
    "    for l in tqdm(fopen):\n",
    "        data = json.loads(l)\n",
    "        t = data['text']\n",
    "        id_fopen.write(f'{json.dumps(t)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7068512 prepare-indon.jsonl\r\n"
     ]
    }
   ],
   "source": [
    "# Count the lines (one JSON record each) in prepare-indon.jsonl.\n",
    "# NOTE(review): the saved output (7068512) equals 3966114 + 3102398, i.e. both dumps combined,\n",
    "# so it looks stale relative to the current cells -- re-run to confirm.\n",
    "!wc -l prepare-indon.jsonl"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
